package com.kdtech.analyse.news;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.analyse.tool.ParseLogic;
import com.kdtech.analyse.tool.ParseTool;
import com.kdtech.analyse.tool.SelectType;
import com.kdtech.analyse.tool.SubstrLogic;
import com.kdtech.analyse.tool.SubstrType;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;
/**
 * News analyser for Sina (http://www.sina.com.cn/).
 *
 * <p>Decides whether a URL looks like a Sina article page and extracts the title,
 * publish date, body text and author/source from the page HTML.
 *
 * @author allen
 */
public class SinaNewsAnalyse implements AnalyseNews {

	/**
	 * URL shapes accepted as article detail pages. Each pattern must match the
	 * whole URL. Compiled once here instead of on every {@link #isDetailPage} call.
	 *
	 * <p>NOTE(review): the dots in host names and file extensions are unescaped, so
	 * e.g. {@code .*.html} also accepts {@code .shtml}. This is kept as-is because
	 * tightening it would reject URLs that are accepted today.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com.cn/.*[0-9]{4}[-|/][0-9]{2}[-|/][0-9]{2}/.*.html"),
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com.cn/.*/[0-9]{8}/.*.html"),
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com.cn/.*/[0-9]{4}/[0-9]{4}/.*.shtml"),
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com.cn/.*/[0-9]{4}[-|/][0-9]{2}[-|/][0-9]{2}/pid_[0-9]+.htm"),
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com.cn/.*/vReport_Show/.*/rptid/[0-9]*/index.phtml"),
		// stockid previously used [0-8]*, which rejected any stock id containing the digit 9.
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com.cn/.*vCB_AllBulletinDetail.php\\?id=[0-9]*&stockid=[0-9]*"),
		Pattern.compile("http://[a-zA-Z0-9_\\.]*.sina.com/.*/.*/[0-9]*/[0-9]*.shtml")
	};

	/** Placeholder text that can lead the extracted body text; removed from the content. */
	private static final String VIDEO_PLACEHOLDER = "视频加载中，请稍候...自动播放play";

	/**
	 * Tells whether the given URL matches one of the known Sina article URL shapes.
	 *
	 * @param url the URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches any detail-page pattern,
	 *         {@code false} otherwise (including for a {@code null} URL)
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts title, date, content and author from a fetched Sina page.
	 *
	 * <p>When the page carries no HTML, a {@link NewsMeta} holding only the URL (with
	 * empty title, content and author) is returned instead of failing inside the
	 * HTML parser.
	 *
	 * @param urlMeta the fetched page (URL plus raw HTML); must not be {@code null}
	 * @return the extracted news metadata
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();

		NewsMeta meta = new NewsMeta();
		meta.setUrl(url);

		if (html == null) {
			// Nothing to parse: hand back the URL-only record.
			meta.setTitle("");
			meta.setContent("");
			meta.setAuthor("");
			return meta;
		}

		Document doc = Jsoup.parse(html);
		removeNoise(doc);

		String title = parseTitle(doc);
		Long date = parseDate(doc, url);
		String content = parseContent(doc);
		String author = parseAuthor(doc);

		// Last-resort content source; note this takes the element's HTML, not its text.
		if (StringUtils.isBlank(content)) {
			content = doc.select("#artCon").html();
		}

		meta.setTitle(title);
		meta.setContent(content);
		meta.setDate(date);
		meta.setAuthor(author);
		return meta;
	}

	/** Removes page elements (pagination, side blocks, related-link paragraphs, ...) from the document before extraction. */
	private void removeNoise(Document doc) {
		doc.select("div.pagination,#_function_code_page,.today-hot,.border_rlb,.side,.dfz-nav").remove();
		doc.select("div.main-body p:containsOwn(相关报道：)").remove();
		doc.select("div.main-body p:containsOwn(更多视频资讯，请关注)").remove();
		doc.select("div#divContent ul,p.related").remove();
	}

	/** Extracts the news title using the registered candidate selectors. */
	private String parseTitle(Document doc) {
		// Presumably ParseTool tries the selectors in the order they are added -- confirm against ParseTool.
		ParseTool tool = new ParseTool(doc);
		tool.addParseLogic(new ParseLogic("#artibodyTitle"));
		tool.addParseLogic(new ParseLogic("div.tagmain th"));
		tool.addParseLogic(new ParseLogic("div.article-title > h2"));
		tool.addParseLogic(new ParseLogic("div.d_title h1"));
		tool.addParseLogic(new ParseLogic("title", new SubstrLogic(SubstrType.before, "_")));
		tool.addParseLogic(new ParseLogic("h1"));
		tool.addParseLogic(new ParseLogic("h2"));
		return tool.parse();
	}

	/** Extracts the publish date from the page, falling back to a date found in the URL. */
	private Long parseDate(Document doc, String url) {
		// Use a dedicated tool so the title selectors are not consulted when looking for a date.
		// NOTE(review): assumes ParseTool keeps its registered selectors after parse() -- confirm.
		ParseTool tool = new ParseTool(doc);
		tool.addParseLogic(new ParseLogic("span#pub_date"));
		tool.addParseLogic(new ParseLogic("h2.z_font2"));
		tool.addParseLogic(new ParseLogic("p.from span em"));
		tool.addParseLogic(new ParseLogic("div.zwdate"));
		tool.addParseLogic(new ParseLogic("div.d_info"));
		tool.addParseLogic(new ParseLogic("div.creab"));
		tool.addParseLogic(new ParseLogic("td.graybgH2"));
		Long date = tool.parseDate();
		if (date == null) {
			date = DateUtils.matchDate(url);
		}
		return date;
	}

	/** Extracts the article body text, dropping the disclaimer span and the video placeholder. */
	private String parseContent(Document doc) {
		ParseTool tool = new ParseTool(doc);
		// The disclaimer is removed from the document itself, so later extraction steps do not see it either.
		doc.select("span:containsOwn(声明：本文仅代表作者观点，不代表新浪网立场)").remove();
		tool.addParseLogic(new ParseLogic("div#artibody"));
		tool.addParseLogic(new ParseLogic("div.main-body"));
		tool.addParseLogic(new ParseLogic("div#show_txt"));
		tool.addParseLogic(new ParseLogic("p.intro em.vdiCont"));
		tool.addParseLogic(new ParseLogic("div#textCont"));
		tool.addParseLogic(new ParseLogic("div#divContent"));
		tool.addParseLogic(new ParseLogic("div.blk_container"));
		tool.addParseLogic(new ParseLogic("div#content"));

		String content = StringUtils.trimSpace(tool.parse());
		// Guard against a null result from the helpers before touching the string.
		if (content != null && content.startsWith(VIDEO_PLACEHOLDER)) {
			content = content.replace(VIDEO_PLACEHOLDER, "");
		}
		return content;
	}

	/** Extracts the author/source by trying a series of selectors until one yields non-blank text. */
	private String parseAuthor(Document doc) {
		String author = doc.select("div.artInfo >span:eq(1) a").text();
		if (!StringUtils.isBlank(author)) {
			return author;
		}

		// For this selector only the first matching element is used.
		Elements linked = doc.select("div.zwcontent div.tc span span.linkRed02");
		if (!linked.isEmpty()) {
			author = linked.get(0).text();
		}

		String[] fallbackSelectors = {
			"div.Main div.blkContainer div#J_Article_Wrap div.blkContainerSblk div.artInfo span[data-sudaclick=media_name]",
			"a[data-sudaclick=media_name]",
			"#media_name"
		};
		for (String selector : fallbackSelectors) {
			if (!StringUtils.isBlank(author)) {
				return author;
			}
			author = doc.select(selector).text();
		}

		if (StringUtils.isBlank(author)) {
			author = JSoupUtils.matchAuthor(doc, "机构：");
		}
		return author;
	}

	/**
	 * Update hook; this analyser does not implement it.
	 *
	 * @param meta the previously extracted news metadata
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/** Manual smoke test: fetches one Sina article and prints the detail-page check and the parsed result. */
	public static void main(String[] args) {
		SinaNewsAnalyse analyser = new SinaNewsAnalyse();
		String url = "http://news.sina.com.cn/c/nd/2016-10-19/doc-ifxwvpar8409653.shtml#2285288";
		UrlMeta fetched = CrawlHTML.responseToURL(url);
		System.out.println(analyser.isDetailPage(url));
		NewsMeta parsed = analyser.parserHtml(fetched);
		System.out.println(parsed);
	}

	/**
	 * Reports whether the update step is needed for this analyser.
	 *
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
